The Sparks Foundation¶

Graduate Rotational Internship Programme (GRIP)¶

MAY 2023 Batch¶

Candidate Name : MANISHA SHARMA¶

Task_3 : Exploratory Data Analysis_Retail¶

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df = pd.read_csv('C://Users//Lenovo//Downloads//SampleSuperstore.csv//SampleSuperstore.csv')
In [3]:
df.head()
Out[3]:
Ship Mode Segment Country City State Postal Code Region Category Sub-Category Sales Quantity Discount Profit
0 Second Class Consumer United States Henderson Kentucky 42420 South Furniture Bookcases 261.9600 2 0.00 41.9136
1 Second Class Consumer United States Henderson Kentucky 42420 South Furniture Chairs 731.9400 3 0.00 219.5820
2 Second Class Corporate United States Los Angeles California 90036 West Office Supplies Labels 14.6200 2 0.00 6.8714
3 Standard Class Consumer United States Fort Lauderdale Florida 33311 South Furniture Tables 957.5775 5 0.45 -383.0310
4 Standard Class Consumer United States Fort Lauderdale Florida 33311 South Office Supplies Storage 22.3680 2 0.20 2.5164
In [4]:
df.tail()
Out[4]:
Ship Mode Segment Country City State Postal Code Region Category Sub-Category Sales Quantity Discount Profit
9989 Second Class Consumer United States Miami Florida 33180 South Furniture Furnishings 25.248 3 0.2 4.1028
9990 Standard Class Consumer United States Costa Mesa California 92627 West Furniture Furnishings 91.960 2 0.0 15.6332
9991 Standard Class Consumer United States Costa Mesa California 92627 West Technology Phones 258.576 2 0.2 19.3932
9992 Standard Class Consumer United States Costa Mesa California 92627 West Office Supplies Paper 29.600 4 0.0 13.3200
9993 Second Class Consumer United States Westminster California 92683 West Office Supplies Appliances 243.160 2 0.0 72.9480
In [5]:
df.shape
Out[5]:
(9994, 13)
In [6]:
df.dtypes
Out[6]:
Ship Mode        object
Segment          object
Country          object
City             object
State            object
Postal Code       int64
Region           object
Category         object
Sub-Category     object
Sales           float64
Quantity          int64
Discount        float64
Profit          float64
dtype: object
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Ship Mode     9994 non-null   object 
 1   Segment       9994 non-null   object 
 2   Country       9994 non-null   object 
 3   City          9994 non-null   object 
 4   State         9994 non-null   object 
 5   Postal Code   9994 non-null   int64  
 6   Region        9994 non-null   object 
 7   Category      9994 non-null   object 
 8   Sub-Category  9994 non-null   object 
 9   Sales         9994 non-null   float64
 10  Quantity      9994 non-null   int64  
 11  Discount      9994 non-null   float64
 12  Profit        9994 non-null   float64
dtypes: float64(3), int64(2), object(8)
memory usage: 1015.1+ KB
In [8]:
df.isnull().sum()
Out[8]:
Ship Mode       0
Segment         0
Country         0
City            0
State           0
Postal Code     0
Region          0
Category        0
Sub-Category    0
Sales           0
Quantity        0
Discount        0
Profit          0
dtype: int64
In [9]:
df.duplicated().sum()
Out[9]:
17
In [10]:
df = df.drop_duplicates()
In [11]:
# df.var()
In [12]:
df.cov()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1545644723.py:1: FutureWarning: The default value of numeric_only in DataFrame.cov is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.cov()
Out[12]:
Postal Code Sales Quantity Discount Profit
Postal Code 1.027732e+09 -469412.768497 935.818272 391.989291 -224679.858643
Sales -4.694128e+05 389028.396022 278.765576 -3.645637 70057.067126
Quantity 9.358183e+02 278.765576 4.958001 0.003990 34.565743
Discount 3.919893e+02 -3.645637 0.003990 0.042624 -10.632751
Profit -2.246799e+05 70057.067126 34.565743 -10.632751 54970.478824
In [13]:
df.corr()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1134722465.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.corr()
Out[13]:
Postal Code Sales Quantity Discount Profit
Postal Code 1.000000 -0.023476 0.013110 0.059225 -0.029892
Sales -0.023476 1.000000 0.200722 -0.028311 0.479067
Quantity 0.013110 0.200722 1.000000 0.008678 0.066211
Discount 0.059225 -0.028311 0.008678 1.000000 -0.219662
Profit -0.029892 0.479067 0.066211 -0.219662 1.000000
In [14]:
df.describe()
Out[14]:
Postal Code Sales Quantity Discount Profit
count 9977.000000 9977.000000 9977.000000 9977.000000 9977.00000
mean 55154.964117 230.148902 3.790719 0.156278 28.69013
std 32058.266816 623.721409 2.226657 0.206455 234.45784
min 1040.000000 0.444000 1.000000 0.000000 -6599.97800
25% 23223.000000 17.300000 2.000000 0.000000 1.72620
50% 55901.000000 54.816000 3.000000 0.200000 8.67100
75% 90008.000000 209.970000 5.000000 0.200000 29.37200
max 99301.000000 22638.480000 14.000000 0.800000 8399.97600
In [15]:
df['Ship Mode'].value_counts()
Out[15]:
Standard Class    5955
Second Class      1943
First Class       1537
Same Day           542
Name: Ship Mode, dtype: int64
In [16]:
df['Segment'].value_counts()
Out[16]:
Consumer       5183
Corporate      3015
Home Office    1779
Name: Segment, dtype: int64
In [17]:
df['Category'].value_counts()
Out[17]:
Office Supplies    6012
Furniture          2118
Technology         1847
Name: Category, dtype: int64
In [18]:
df['Sub-Category'].value_counts()
Out[18]:
Binders        1522
Paper          1359
Furnishings     956
Phones          889
Storage         846
Art             795
Accessories     775
Chairs          615
Appliances      466
Labels          363
Tables          319
Envelopes       254
Bookcases       228
Fasteners       217
Supplies        190
Machines        115
Copiers          68
Name: Sub-Category, dtype: int64
In [19]:
df['Sales'].plot(kind='box')
plt.show()
In [20]:
df['Quantity'].value_counts()
Out[20]:
3     2400
2     2399
5     1230
4     1186
1      899
7      606
6      572
9      258
8      257
10      57
11      34
14      29
13      27
12      23
Name: Quantity, dtype: int64
In [21]:
df['Discount'].plot(kind='kde')
plt.show()
In [22]:
df['Profit'].plot(kind='kde')
plt.show()
In [23]:
df['Country'].unique()
Out[23]:
array(['United States'], dtype=object)
In [24]:
df['City'].unique()
Out[24]:
array(['Henderson', 'Los Angeles', 'Fort Lauderdale', 'Concord',
       'Seattle', 'Fort Worth', 'Madison', 'West Jordan', 'San Francisco',
       'Fremont', 'Philadelphia', 'Orem', 'Houston', 'Richardson',
       'Naperville', 'Melbourne', 'Eagan', 'Westland', 'Dover',
       'New Albany', 'New York City', 'Troy', 'Chicago', 'Gilbert',
       'Springfield', 'Jackson', 'Memphis', 'Decatur', 'Durham',
       'Columbia', 'Rochester', 'Minneapolis', 'Portland', 'Saint Paul',
       'Aurora', 'Charlotte', 'Orland Park', 'Urbandale', 'Columbus',
       'Bristol', 'Wilmington', 'Bloomington', 'Phoenix', 'Roseville',
       'Independence', 'Pasadena', 'Newark', 'Franklin', 'Scottsdale',
       'San Jose', 'Edmond', 'Carlsbad', 'San Antonio', 'Monroe',
       'Fairfield', 'Grand Prairie', 'Redlands', 'Hamilton', 'Westfield',
       'Akron', 'Denver', 'Dallas', 'Whittier', 'Saginaw', 'Medina',
       'Dublin', 'Detroit', 'Tampa', 'Santa Clara', 'Lakeville',
       'San Diego', 'Brentwood', 'Chapel Hill', 'Morristown',
       'Cincinnati', 'Inglewood', 'Tamarac', 'Colorado Springs',
       'Belleville', 'Taylor', 'Lakewood', 'Arlington', 'Arvada',
       'Hackensack', 'Saint Petersburg', 'Long Beach', 'Hesperia',
       'Murfreesboro', 'Layton', 'Austin', 'Lowell', 'Manchester',
       'Harlingen', 'Tucson', 'Quincy', 'Pembroke Pines', 'Des Moines',
       'Peoria', 'Las Vegas', 'Warwick', 'Miami', 'Huntington Beach',
       'Richmond', 'Louisville', 'Lawrence', 'Canton', 'New Rochelle',
       'Gastonia', 'Jacksonville', 'Auburn', 'Norman', 'Park Ridge',
       'Amarillo', 'Lindenhurst', 'Huntsville', 'Fayetteville',
       'Costa Mesa', 'Parker', 'Atlanta', 'Gladstone', 'Great Falls',
       'Lakeland', 'Montgomery', 'Mesa', 'Green Bay', 'Anaheim',
       'Marysville', 'Salem', 'Laredo', 'Grove City', 'Dearborn',
       'Warner Robins', 'Vallejo', 'Mission Viejo', 'Rochester Hills',
       'Plainfield', 'Sierra Vista', 'Vancouver', 'Cleveland', 'Tyler',
       'Burlington', 'Waynesboro', 'Chester', 'Cary', 'Palm Coast',
       'Mount Vernon', 'Hialeah', 'Oceanside', 'Evanston', 'Trenton',
       'Cottage Grove', 'Bossier City', 'Lancaster', 'Asheville',
       'Lake Elsinore', 'Omaha', 'Edmonds', 'Santa Ana', 'Milwaukee',
       'Florence', 'Lorain', 'Linden', 'Salinas', 'New Brunswick',
       'Garland', 'Norwich', 'Alexandria', 'Toledo', 'Farmington',
       'Riverside', 'Torrance', 'Round Rock', 'Boca Raton',
       'Virginia Beach', 'Murrieta', 'Olympia', 'Washington',
       'Jefferson City', 'Saint Peters', 'Rockford', 'Brownsville',
       'Yonkers', 'Oakland', 'Clinton', 'Encinitas', 'Roswell',
       'Jonesboro', 'Antioch', 'Homestead', 'La Porte', 'Lansing',
       'Cuyahoga Falls', 'Reno', 'Harrisonburg', 'Escondido', 'Royal Oak',
       'Rockville', 'Coral Springs', 'Buffalo', 'Boynton Beach',
       'Gulfport', 'Fresno', 'Greenville', 'Macon', 'Cedar Rapids',
       'Providence', 'Pueblo', 'Deltona', 'Murray', 'Middletown',
       'Freeport', 'Pico Rivera', 'Provo', 'Pleasant Grove', 'Smyrna',
       'Parma', 'Mobile', 'New Bedford', 'Irving', 'Vineland', 'Glendale',
       'Niagara Falls', 'Thomasville', 'Westminster', 'Coppell', 'Pomona',
       'North Las Vegas', 'Allentown', 'Tempe', 'Laguna Niguel',
       'Bridgeton', 'Everett', 'Watertown', 'Appleton', 'Bellevue',
       'Allen', 'El Paso', 'Grapevine', 'Carrollton', 'Kent', 'Lafayette',
       'Tigard', 'Skokie', 'Plano', 'Suffolk', 'Indianapolis', 'Bayonne',
       'Greensboro', 'Baltimore', 'Kenosha', 'Olathe', 'Tulsa', 'Redmond',
       'Raleigh', 'Muskogee', 'Meriden', 'Bowling Green', 'South Bend',
       'Spokane', 'Keller', 'Port Orange', 'Medford', 'Charlottesville',
       'Missoula', 'Apopka', 'Reading', 'Broomfield', 'Paterson',
       'Oklahoma City', 'Chesapeake', 'Lubbock', 'Johnson City',
       'San Bernardino', 'Leominster', 'Bozeman', 'Perth Amboy',
       'Ontario', 'Rancho Cucamonga', 'Moorhead', 'Mesquite', 'Stockton',
       'Ormond Beach', 'Sunnyvale', 'York', 'College Station',
       'Saint Louis', 'Manteca', 'San Angelo', 'Salt Lake City',
       'Knoxville', 'Little Rock', 'Lincoln Park', 'Marion', 'Littleton',
       'Bangor', 'Southaven', 'New Castle', 'Midland', 'Sioux Falls',
       'Fort Collins', 'Clarksville', 'Sacramento', 'Thousand Oaks',
       'Malden', 'Holyoke', 'Albuquerque', 'Sparks', 'Coachella',
       'Elmhurst', 'Passaic', 'North Charleston', 'Newport News',
       'Jamestown', 'Mishawaka', 'La Quinta', 'Tallahassee', 'Nashville',
       'Bellingham', 'Woodstock', 'Haltom City', 'Wheeling',
       'Summerville', 'Hot Springs', 'Englewood', 'Las Cruces', 'Hoover',
       'Frisco', 'Vacaville', 'Waukesha', 'Bakersfield', 'Pompano Beach',
       'Corpus Christi', 'Redondo Beach', 'Orlando', 'Orange',
       'Lake Charles', 'Highland Park', 'Hempstead', 'Noblesville',
       'Apple Valley', 'Mount Pleasant', 'Sterling Heights', 'Eau Claire',
       'Pharr', 'Billings', 'Gresham', 'Chattanooga', 'Meridian',
       'Bolingbrook', 'Maple Grove', 'Woodland', 'Missouri City',
       'Pearland', 'San Mateo', 'Grand Rapids', 'Visalia',
       'Overland Park', 'Temecula', 'Yucaipa', 'Revere', 'Conroe',
       'Tinley Park', 'Dubuque', 'Dearborn Heights', 'Santa Fe',
       'Hickory', 'Carol Stream', 'Saint Cloud', 'North Miami',
       'Plantation', 'Port Saint Lucie', 'Rock Hill', 'Odessa',
       'West Allis', 'Chula Vista', 'Manhattan', 'Altoona', 'Thornton',
       'Champaign', 'Texarkana', 'Edinburg', 'Baytown', 'Greenwood',
       'Woonsocket', 'Superior', 'Bedford', 'Covington', 'Broken Arrow',
       'Miramar', 'Hollywood', 'Deer Park', 'Wichita', 'Mcallen',
       'Iowa City', 'Boise', 'Cranston', 'Port Arthur', 'Citrus Heights',
       'The Colony', 'Daytona Beach', 'Bullhead City', 'Portage', 'Fargo',
       'Elkhart', 'San Gabriel', 'Margate', 'Sandy Springs', 'Mentor',
       'Lawton', 'Hampton', 'Rome', 'La Crosse', 'Lewiston',
       'Hattiesburg', 'Danville', 'Logan', 'Waterbury', 'Athens',
       'Avondale', 'Marietta', 'Yuma', 'Wausau', 'Pasco', 'Oak Park',
       'Pensacola', 'League City', 'Gaithersburg', 'Lehi', 'Tuscaloosa',
       'Moreno Valley', 'Georgetown', 'Loveland', 'Chandler', 'Helena',
       'Kirkwood', 'Waco', 'Frankfort', 'Bethlehem', 'Grand Island',
       'Woodbury', 'Rogers', 'Clovis', 'Jupiter', 'Santa Barbara',
       'Cedar Hill', 'Norfolk', 'Draper', 'Ann Arbor', 'La Mesa',
       'Pocatello', 'Holland', 'Milford', 'Buffalo Grove', 'Lake Forest',
       'Redding', 'Chico', 'Utica', 'Conway', 'Cheyenne', 'Owensboro',
       'Caldwell', 'Kenner', 'Nashua', 'Bartlett', 'Redwood City',
       'Lebanon', 'Santa Maria', 'Des Plaines', 'Longview',
       'Hendersonville', 'Waterloo', 'Cambridge', 'Palatine', 'Beverly',
       'Eugene', 'Oxnard', 'Renton', 'Glenview', 'Delray Beach',
       'Commerce City', 'Texas City', 'Wilson', 'Rio Rancho', 'Goldsboro',
       'Montebello', 'El Cajon', 'Beaumont', 'West Palm Beach', 'Abilene',
       'Normal', 'Saint Charles', 'Camarillo', 'Hillsboro', 'Burbank',
       'Modesto', 'Garden City', 'Atlantic City', 'Longmont', 'Davis',
       'Morgan Hill', 'Clifton', 'Sheboygan', 'East Point', 'Rapid City',
       'Andover', 'Kissimmee', 'Shelton', 'Danbury', 'Sanford',
       'San Marcos', 'Greeley', 'Mansfield', 'Elyria', 'Twin Falls',
       'Coral Gables', 'Romeoville', 'Marlborough', 'Laurel', 'Bryan',
       'Pine Bluff', 'Aberdeen', 'Hagerstown', 'East Orange',
       'Arlington Heights', 'Oswego', 'Coon Rapids', 'San Clemente',
       'San Luis Obispo', 'Springdale', 'Lodi', 'Mason'], dtype=object)
In [25]:
df['State'].unique()
Out[25]:
array(['Kentucky', 'California', 'Florida', 'North Carolina',
       'Washington', 'Texas', 'Wisconsin', 'Utah', 'Nebraska',
       'Pennsylvania', 'Illinois', 'Minnesota', 'Michigan', 'Delaware',
       'Indiana', 'New York', 'Arizona', 'Virginia', 'Tennessee',
       'Alabama', 'South Carolina', 'Oregon', 'Colorado', 'Iowa', 'Ohio',
       'Missouri', 'Oklahoma', 'New Mexico', 'Louisiana', 'Connecticut',
       'New Jersey', 'Massachusetts', 'Georgia', 'Nevada', 'Rhode Island',
       'Mississippi', 'Arkansas', 'Montana', 'New Hampshire', 'Maryland',
       'District of Columbia', 'Kansas', 'Vermont', 'Maine',
       'South Dakota', 'Idaho', 'North Dakota', 'Wyoming',
       'West Virginia'], dtype=object)
In [26]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Convert 'Ship Mode' column to categorical type
df['Ship Mode'] = df['Ship Mode'].astype('category')

# Plot count using seaborn
sns.countplot(data=df, x='Ship Mode', palette='Set2')
plt.show()
In [27]:
# Convert 'Segment' column to categorical type
df['Segment'] = df['Segment'].astype('category')

# Plot count using seaborn
sns.countplot(data=df, x='Segment', palette='Set2')
plt.show()
In [28]:
# sns.barplot(df['State'], df['Sales'], palette='Set2')
In [29]:
# sns.barplot(df['City'], df['Sales'], palette='Set2')
In [30]:
plt.figure(figsize=(30,10))
df['State'].value_counts().plot.bar()
plt.show()
In [31]:
# Convert 'Region' column to categorical type
df['Region'] = df['Region'].astype('category')

# Plot count using seaborn
sns.countplot(data=df, x='Region', palette='Set2')
plt.show()
In [32]:
# Convert 'Category' column to categorical type
df['Category'] = df['Category'].astype('category')

# Plot count using seaborn
sns.countplot(data=df, x='Category', palette='Set2')
plt.show()
In [33]:
plt.figure(figsize=(20,10))
sns.countplot(x=df['Sub-Category'], palette='Set2',)
plt.show()
In [34]:
df['Sales'].value_counts()
Out[34]:
12.960     55
19.440     37
15.552     37
10.368     35
25.920     34
           ..
4.240       1
319.960     1
646.740     1
81.940      1
243.160     1
Name: Sales, Length: 5825, dtype: int64
In [35]:
df['Sales'].value_counts().sum()
Out[35]:
9977
In [36]:
plt.figure(figsize=(30,10))
df['Sales'].plot(kind='line')
plt.show()
In [37]:
df['Quantity'].value_counts()
Out[37]:
3     2400
2     2399
5     1230
4     1186
1      899
7      606
6      572
9      258
8      257
10      57
11      34
14      29
13      27
12      23
Name: Quantity, dtype: int64
In [38]:
sns.countplot(x=df['Quantity'], palette='Set2')
plt.show()
In [39]:
sns.countplot(x=df['Discount'], palette='Set2')
plt.show()
In [40]:
df['Profit'].value_counts()
Out[40]:
0.0000     65
6.2208     42
9.3312     36
3.6288     31
5.4432     30
           ..
83.2508     1
16.1096     1
7.1988      1
1.6510      1
72.9480     1
Name: Profit, Length: 7287, dtype: int64
In [41]:
plt.figure(figsize=(30,10))
df['Profit'].plot(kind='line')
plt.show()
In [42]:
plt.figure(figsize=(10,5))
sns.heatmap(df.corr(), annot=True)
plt.show()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\2867204846.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(df.corr(), annot=True)
In [43]:
sns.pairplot(df, hue='Ship Mode')
plt.show()
In [44]:
ship_mode = df.groupby('Ship Mode')[['Profit', 'Sales']]
ship_mode.head()
Out[44]:
Profit Sales
0 41.9136 261.9600
1 219.5820 731.9400
2 6.8714 14.6200
3 -383.0310 957.5775
4 2.5164 22.3680
5 14.1694 48.8600
6 1.9656 7.2800
7 90.7152 907.1520
17 9.9900 55.5000
18 2.4824 8.5600
35 123.4737 1097.5440
36 -147.9630 190.9200
44 19.7714 45.9800
45 8.2062 17.4600
55 52.1400 208.5600
366 10.4400 23.2000
367 0.1472 7.3600
368 29.3412 104.7900
369 271.4192 1043.9200
657 -50.0980 701.3720
In [45]:
ship_mode.cov()
Out[45]:
Profit Sales
Ship Mode
First Class Profit 66500.973629 119609.940039
Sales 119609.940039 397622.361191
Same Day Profit 51482.952083 -6040.748196
Sales -6040.748196 308811.805446
Second Class Profit 23377.756306 38957.194668
Sales 38957.194668 312913.595466
Standard Class Profit 62640.769596 74363.646237
Sales 74363.646237 419100.307380
In [46]:
ship_mode.corr()
Out[46]:
Profit Sales
Ship Mode
First Class Profit 1.000000 0.735560
Sales 0.735560 1.000000
Same Day Profit 1.000000 -0.047908
Sales -0.047908 1.000000
Second Class Profit 1.000000 0.455485
Sales 0.455485 1.000000
Standard Class Profit 1.000000 0.458958
Sales 0.458958 1.000000
In [47]:
ship_mode.sum()
Out[47]:
Profit Sales
Ship Mode
First Class 48953.6561 3.513805e+05
Same Day 15871.8869 1.283217e+05
Second Class 57446.6516 4.591770e+05
Standard Class 163969.2280 1.357316e+06
In [48]:
# df.plot(kind='line', x='Sales', y='Profit')
In [49]:
sns.lineplot(data=df, x='Sales', y='Profit')
plt.show()
In [50]:
sns.pairplot(df, hue='State', palette='Set2')
plt.show()
In [51]:
# sns.pairplot(df, hue='Sales', palette='Set2')
In [52]:
# sns.pairplot(df, hue='Profit', palette='Set2')
In [53]:
sns.pairplot(df, hue='Discount', palette='Set2')
plt.show()
In [54]:
state = df.groupby(['State'])[['Sales', 'Discount', 'Profit']]
state.head()
Out[54]:
Sales Discount Profit
0 261.9600 0.00 41.9136
1 731.9400 0.00 219.5820
2 14.6200 0.00 6.8714
3 957.5775 0.45 -383.0310
4 22.3680 0.20 2.5164
... ... ... ...
7896 447.8400 0.00 219.4416
8798 1294.7500 0.00 336.6350
9146 99.9800 0.00 42.9914
9147 8.0400 0.00 2.7336
9486 673.3440 0.30 -76.9536

240 rows × 3 columns

In [55]:
state.mean()
Out[55]:
Sales Discount Profit
State
Alabama 319.846557 0.000000 94.865989
Arizona 157.508933 0.303571 -15.303235
Arkansas 194.635500 0.000000 66.811452
California 229.246629 0.072946 38.241878
Colorado 176.418231 0.316484 -35.867351
Connecticut 163.223866 0.007317 42.823071
Delaware 285.948635 0.006250 103.930988
District of Columbia 286.502000 0.000000 105.958930
Florida 233.612815 0.299347 -8.875461
Georgia 266.825217 0.000000 88.315453
Idaho 208.689810 0.085714 39.367767
Illinois 163.263823 0.389206 -25.665275
Indiana 359.431946 0.000000 123.375411
Iowa 152.658667 0.000000 39.460397
Kansas 121.429583 0.000000 34.851813
Kentucky 263.250000 0.000000 80.573357
Louisiana 219.453095 0.000000 52.288150
Maine 158.816250 0.000000 56.810775
Maryland 225.766886 0.005714 66.963608
Massachusetts 212.106919 0.015556 50.262975
Michigan 298.738756 0.007087 96.173584
Minnesota 335.541011 0.000000 121.608847
Mississippi 203.232830 0.000000 59.867475
Missouri 336.441667 0.000000 97.518341
Montana 372.623467 0.066667 122.221900
Nebraska 196.445526 0.000000 53.607742
Nevada 428.951333 0.061538 85.045279
New Hampshire 270.093481 0.011111 63.203807
New Jersey 275.110092 0.004615 75.176260
New Mexico 129.284378 0.059459 31.273408
New York 275.800489 0.055368 65.674767
North Carolina 223.305880 0.283534 -30.083985
North Dakota 131.415714 0.000000 32.878529
Ohio 166.617017 0.325000 -36.237859
Oklahoma 298.233182 0.000000 73.544788
Oregon 141.632374 0.289431 -9.708124
Pennsylvania 198.799253 0.328840 -26.562122
Rhode Island 404.070643 0.021429 130.100523
South Carolina 201.945476 0.000000 42.120395
South Dakota 109.630000 0.000000 32.902358
Tennessee 167.551219 0.291257 -29.189583
Texas 173.066675 0.370539 -26.196321
Utah 211.699170 0.060377 48.047802
Vermont 811.760909 0.000000 204.088936
Virginia 315.342500 0.000000 83.026564
Washington 276.017550 0.064542 66.470593
West Virginia 302.456000 0.075000 46.480400
Wisconsin 291.951000 0.000000 76.380004
Wyoming 1603.136000 0.200000 100.196000
In [56]:
state.std()
Out[56]:
Sales Discount Profit
State
Alabama 545.761807 0.000000 210.901876
Arizona 249.710692 0.197031 109.266267
Arkansas 316.405669 0.000000 123.249880
California 491.510940 0.095275 97.803399
Colorado 324.415072 0.208792 276.424024
Connecticut 251.732268 0.046562 66.138081
Delaware 1112.818485 0.043073 518.953513
District of Columbia 547.419707 0.000000 213.461353
Florida 1205.490630 0.192784 126.324165
Georgia 637.796228 0.000000 283.026694
Idaho 282.061093 0.101419 63.027976
Illinois 317.364546 0.244669 175.872237
Indiana 1481.538652 0.000000 693.643105
Iowa 326.637475 0.000000 73.763444
Kansas 124.606356 0.000000 42.619311
Kentucky 472.818538 0.000000 171.126404
Louisiana 365.264153 0.000000 83.357777
Maine 146.676370 0.000000 53.816911
Maryland 396.914182 0.041204 148.214655
Massachusetts 336.924698 0.066766 94.386407
Michigan 827.737778 0.025711 375.279834
Minnesota 1070.678744 0.000000 505.784116
Mississippi 365.834352 0.000000 80.793505
Missouri 830.866752 0.000000 308.032711
Montana 784.412796 0.097590 350.364144
Nebraska 459.444553 0.000000 133.163393
Nevada 881.267531 0.093514 265.901543
New Hampshire 475.181739 0.057735 119.893107
New Jersey 890.520072 0.037066 236.387912
New Mexico 190.093868 0.092675 39.290011
New York 691.990620 0.096589 232.640276
North Carolina 648.754454 0.178959 297.900997
North Dakota 256.602857 0.000000 58.123254
Ohio 353.599895 0.188295 341.709917
Oklahoma 431.874240 0.000000 109.026519
Oregon 215.583223 0.185902 74.744861
Pennsylvania 531.457950 0.189748 136.404709
Rhode Island 853.179916 0.077961 384.816449
South Carolina 318.462333 0.000000 73.698963
South Dakota 148.257140 0.000000 43.803938
Tennessee 360.678518 0.184956 176.636392
Texas 424.831382 0.243489 189.207658
Utah 378.723382 0.092694 83.929399
Vermont 1309.310745 0.000000 301.653921
Virginia 792.001296 0.000000 235.266720
Washington 780.008540 0.093596 326.560751
West Virginia 313.508627 0.150000 125.265494
Wisconsin 461.590111 0.000000 124.946125
Wyoming NaN NaN NaN
In [57]:
state.corr()
Out[57]:
Sales Discount Profit
State
Alabama Sales 1.000000 NaN 0.896932
Discount NaN NaN NaN
Profit 0.896932 NaN 1.000000
Arizona Sales 1.000000 -0.05654 -0.242726
Discount -0.056540 1.00000 -0.427620
... ... ... ... ...
Wisconsin Discount NaN NaN NaN
Profit 0.952947 NaN 1.000000
Wyoming Sales NaN NaN NaN
Discount NaN NaN NaN
Profit NaN NaN NaN

147 rows × 3 columns

In [58]:
plt.figure(figsize=(10,5))
sns.heatmap(state.corr().head(), annot=True)
plt.show()
In [59]:
sns.lineplot(data=df, x='Discount', y='Profit')
plt.show()
In [60]:
sns.barplot(data=df, x='Discount', y='Profit', palette='Set2')
plt.show()
In [61]:
sns.lineplot(data=df, x='Sales', y='Profit')
plt.show()
In [62]:
sns.lineplot(data=df, x='Sales', y='Discount')
plt.show()
In [63]:
city = df.groupby(['City'])[['Sales', 'Discount', 'Profit']].mean()
In [64]:
city = city.sort_values('Profit')
In [65]:
city.head()
Out[65]:
Sales Discount Profit
City
Bethlehem 337.926800 0.380000 -200.619160
Champaign 151.960000 0.600000 -182.352000
Oswego 107.326000 0.600000 -178.709200
Round Rock 693.436114 0.274286 -169.061614
Lancaster 215.031826 0.315217 -157.371052
In [66]:
# For High Profit Analysis

city['Profit'].tail(50).plot(kind='bar', figsize=(15,5), color='green')
plt.title('City Wise Sales Discount & Profit Analysis')
plt.show()
In [67]:
# For Low Profit Analysis

city['Profit'].head(50).plot(kind='bar', figsize=(15,5), color='red')
plt.title('City Wise Sales Discount & Profit Analysis')
plt.show()
In [68]:
quantity = df.groupby(['Quantity'])[['Sales', 'Discount', 'Profit']]
In [69]:
quantity = df.groupby(['Quantity'])[['Sales', 'Discount', 'Profit']].mean()
In [70]:
quantity.head(10)
Out[70]:
Sales Discount Profit
Quantity
1 59.234632 0.152959 8.276396
2 120.377982 0.154844 16.027769
3 175.630896 0.153321 23.720306
4 272.779827 0.158373 37.227971
5 337.936339 0.157146 40.257394
6 362.101960 0.166556 18.051517
7 395.888393 0.161980 56.579163
8 458.210802 0.171595 42.244342
9 498.083683 0.147946 68.557716
10 422.046737 0.190702 35.862404
In [71]:
# 1_Sales 2_Discount 3_Profit

quantity.plot.pie(subplots=True, autopct='%1.1f%%', figsize=(25,25),
                  pctdistance=0.69, startangle=90, shadow=True, labels=quantity.index)
plt.title('Quantity Wise Sales Discount & Profit Analysis', fontsize=30)
plt.show()
In [72]:
category = df.groupby(['Category'])[['Sales', 'Discount', 'Profit']]
In [73]:
category = df.groupby(['Category'])[['Sales', 'Discount', 'Profit']].mean()
In [74]:
category
Out[74]:
Sales Discount Profit
Category
Furniture 350.002981 0.174027 8.697740
Office Supplies 119.550107 0.157385 20.353403
Technology 452.709276 0.132323 78.752002
In [75]:
category.plot.pie(subplots=True, figsize=(18,20), autopct='%1.1f%%', labels=category.index)
plt.title('Category Wise Sales Discount & Profit Analysis', fontsize=15)
plt.show()
In [76]:
sub_category = df.groupby(['Sub-Category'])[['Sales', 'Discount', 'Profit']]
In [77]:
sub_category = df.groupby(['Sub-Category'])[['Sales', 'Discount', 'Profit']].mean()
In [78]:
sub_category
Out[78]:
Sales Discount Profit
Sub-Category
Accessories 215.974604 0.078452 54.111788
Appliances 230.755710 0.166524 38.922758
Art 34.096896 0.074969 8.207059
Binders 133.645972 0.372011 19.860710
Bookcases 503.859633 0.211140 -15.230509
Chairs 532.971969 0.170244 43.198582
Copiers 2198.941618 0.161765 817.909190
Envelopes 64.867724 0.080315 27.418019
Fasteners 13.936774 0.082028 4.375660
Furnishings 95.902745 0.138494 13.653476
Labels 34.283504 0.068871 15.224193
Machines 1645.553313 0.306087 29.432669
Paper 57.560075 0.074908 24.977365
Phones 371.211534 0.154556 50.073938
Storage 264.590553 0.074704 25.152277
Supplies 245.650200 0.076842 -6.258418
Tables 648.794771 0.261285 -55.565771
In [79]:
plt.figure(figsize=(25,25))
plt.pie(sub_category['Sales'], labels=sub_category.index, autopct='%1.1f%%')
plt.title('Sub_Category Wise Sales Profit & Discount Analysis', fontsize=30)
plt.legend()
plt.xticks(rotation=90)
plt.show()
In [80]:
plt.figure(figsize=(25,25))
plt.pie(sub_category['Discount'], labels=sub_category.index, autopct='%1.1f%%')
plt.title('Sub_Category Wise Discount Analysis', fontsize=30)
plt.legend()
plt.xticks(rotation=90)
plt.show()
In [81]:
sub_category.sort_values('Profit')[['Sales','Profit']].plot(kind='bar',
                                                       figsize=(20,10),
                                                       label=['Average Sales Price($)', 'Profit($)'])
plt.title('Sub_Category Wise Profit Analysis', fontsize=30)
plt.legend()
plt.show()
In [82]:
region = df.groupby(['Region'])[['Sales', 'Discount', 'Profit']]
In [83]:
region = df.groupby(['Region'])[['Sales', 'Discount', 'Profit']].mean()
In [84]:
region
Out[84]:
Sales Discount Profit
Region
Central 215.947759 0.240250 17.100421
East 238.465798 0.145343 32.163905
South 241.803645 0.147253 28.857673
West 227.139254 0.109615 33.927281
In [85]:
region.plot.pie(subplots=True, figsize=(25,20), autopct='%1.1f%%', labels=region.index)
plt.title('Region Wise Profit Analysis', fontsize=30)
plt.legend()
plt.show()
In [86]:
df.hist(bins=50 ,figsize=(20,15))
plt.show()
In [87]:
sns.pairplot(df, hue='Sub-Category', palette='Set2')
plt.show()
In [88]:
def state_data_viewer(states):
    
    product_data = df.groupby(['State'])
    for state in states:
        data = product_data.get_group(state).groupby(['Category'])
        fig, ax = plt.subplots(1,3, figsize=(28,5))
        fig.suptitle(state, fontsize=14)        
        ax_index = 0
        for cat in ['Furniture', 'Office Supplies', 'Technology']:
            cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
            sns.barplot(x=cat_data.Profit, y=cat_data.index, ax=ax[ax_index], palette='Set2')
            ax[ax_index].set_ylabel(cat)
            ax_index +=1
        fig.show()
In [89]:
states = ['California', 'Washington', 'Mississippi', 'Arizona', 'Texas']
state_data_viewer(states)
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:14: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:14: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:14: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:14: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:14: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()
In [90]:
x = df.iloc[:, [9,10,11,12]].values

from sklearn.cluster import KMeans
wcss = []

for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', 
                    max_iter = 300, n_init = 10, 
                    random_state=0).fit(x)
    wcss.append(kmeans.inertia_)
In [91]:
sns.set_style('whitegrid') 
sns.FacetGrid(df, hue='Sub-Category', height=6).map(plt.scatter, 'Sales', 'Quantity')
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], 
            s=100, c='yellow', label='Centroids')

plt.legend()
plt.show()
In [92]:
sns.set_style('whitegrid') 
sns.FacetGrid(df, hue='Sub-Category', height=6).map(plt.scatter, 'Sales', 'Profit')
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], 
            s=100, c='yellow', label='Centroids')

plt.legend()
plt.show()
In [93]:
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(df['Sales'] , df['Profit'])
ax.set_xlabel('Sales')
ax.set_ylabel('Profit')
ax.set_title('Sales & Profit')
plt.show()